We will examine the file data/college_data.csv. The file contains more than 160 different variables for the 395 doctoral universities in the US. The data were obtained from the Integrated Postsecondary Education Data System (IPEDS). The variables correspond to the most recent available academic year. In most cases, this means 2018-19. However, some variables are from the 2017-18 year.
# Global knitr chunk options: centre every figure and let it span the full
# width of the output.
knitr::opts_chunk$set(fig.align = "center", out.width = "100%")

# utils.R: project helper script. The path is relative to the working
# directory used when the document is rendered.
# NOTE(review): presumably this is where linregression_traditional(),
# ggplotRegression() and ggplotJitter() are defined and where ggplot2/plotly
# get attached (see the attach messages that follow) -- confirm in utils.R.
source("utils/utils.R")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the IPEDS extract; path is relative to the rendering directory.
data <- read.csv(file = "data/college_data.csv")

# Column-name handles. These strings are used further down to select columns
# of `data` and are passed to the plotting/regression helpers.
ID        <- "Institution.Name"
SZ        <- "Total..enrollment..DRVEF2018."
carnegie  <- "Carnegie.Classification..HD2018."
landgrant <- "Land.Grant.Institution..HDNo0Yes8."
sector    <- "Sector.of.institution..HDPrivate0Public8."
# carnegie: Carnegie Classification as of 2018.
# landgrant: Whether it is a land-grant institution or not.
# sector: Whether the institution is public or not-for-profit private.

# Column names of the two variables to relate: total FTE staff against
# instructional, research and public-service FTE.
Y <- "Total.FTE.staff..DRVHR2018."
X <- "Instructional..research.and.public.service.FTE..DRVHR2018."

# Project helper that takes the data and the column names.
# NOTE(review): presumably fits Y on X; the argument order here is (X, Y)
# while ggplotRegression() elsewhere in this file is called with (Y, X) --
# confirm both signatures in utils/utils.R.
regression <- linregression_traditional(data, X, Y, carnegie)

# Keep the fit statistics from the model summary for later use.
summary_reg <- summary(regression)
rsquared <- summary_reg$r.squared
coeffs <- summary_reg$coefficients
# ggplot2 instead: nicer plots
# ggplotRegression() is a project helper; its result is indexed as a
# two-element list below.
foo <- ggplotRegression(
  data, Y, X, carnegie, sector, ID, SZ,
  "US Doctoral Universities"
)

# First element: the plot (auto-printed here).
foo[[1]]

# Second element: the object whose summary carries r.squared and the
# coefficient table (presumably the fitted model -- confirm in utils.R).
summary_reg <- summary(foo[[2]])
rsquared <- summary_reg$r.squared
coeffs <- summary_reg$coefficients
# plotly to get an interactive plot! plotly works well with ggplot2.
# The tooltip entries name the aesthetics shown on hover.
# NOTE(review): 'id' and 'enroll' are presumably aesthetics set inside
# ggplotRegression() -- confirm in utils.R.
ggplotly(foo[[1]], tooltip = c("id", "enroll", "x", "y"))
# phantomJS.
# webshot relies on the PhantomJS binary to take screenshots.
# NOTE(review): presumably needed so the interactive plotly widgets can be
# rendered to static output formats -- confirm.
library(webshot)

# NOTE(review): this call runs on every render; whether it skips the download
# when PhantomJS is already present depends on the installed webshot
# version -- confirm.
webshot::install_phantomjs()
# Second comparison: graduate enrollment against undergraduate enrollment.
X <- "Undergraduate.enrollment..DRVEF2018."
Y <- "Graduate.enrollment..DRVEF2018."

foo <- ggplotRegression(
  data, Y, X, carnegie, sector, ID, SZ,
  "US Doctoral Universities"
)

# Show the plot, then refresh the stored fit statistics.
foo[[1]]
summary_reg <- summary(foo[[2]])
rsquared <- summary_reg[["r.squared"]]
coeffs <- summary_reg[["coefficients"]]
# Third comparison: the same relation restricted to full-time students.
X <- "Full.time.undergraduate.enrollment..DRVEF2018."
Y <- "Full.time.graduate.enrollment..DRVEF2018."

foo <- ggplotRegression(
  data, Y, X, carnegie, sector, ID, SZ,
  "US Doctoral Universities"
)

# Show the plot, then refresh the stored fit statistics.
foo[[1]]
summary_reg <- summary(foo[[2]])
rsquared <- summary_reg[["r.squared"]]
coeffs <- summary_reg[["coefficients"]]
# NOTE(review): `PhD` and `GradT` are column-name strings that are not defined
# in the visible part of this file -- they must be assigned before this point.

# Jitter plot of the raw PhD count, made interactive with plotly.
p <- ggplotJitter(data, PhD, carnegie, sector, landgrant, ID, SZ)
ggplotly(p, tooltip = c("id", "enroll", "y"))

# Same plot for the PhD count expressed as a percentage of the `GradT` column.
# NOTE(review): rows where the `GradT` value is 0 yield Inf/NaN here.
Normalized <- "Students.receiving.a.PhD.normalized..DRVC2018."
df <- data[c(carnegie, PhD, ID, SZ, sector, landgrant)]
df[[Normalized]] <- data[[PhD]] / data[[GradT]] * 100

p <- ggplotJitter(df, Normalized, carnegie, sector, landgrant, ID, SZ)
ggplotly(p, tooltip = c("id", "enroll", "y"))